{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import Graphics as MyGR\n",
    "import pickle\n",
    "import os\n",
    "import datetime\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.dates as matdates\n",
    "from matplotlib.ticker import MaxNLocator\n",
    "\n",
    "# Candidate project folders: the same Dropbox sub-path under each possible drive / mount root\n",
    "FolderList = [\n",
    "    Root + \"Dropbox (Bank of Canada)\\\\Research Projects\\\\OHANK\\\\Empirics\\\\Analysis_Distribution\\\\\"\n",
    "    for Root in (\"D:\\\\\", \"B:\\\\\", \"/mnt/b/\")\n",
    "]\n",
    "# Enter every candidate that exists (there is no break, so the last existing one ends up as the cwd)\n",
    "for Folder in FolderList:\n",
    "    if not os.path.exists(Folder):\n",
    "        continue\n",
    "    os.chdir(Folder)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Read in the Data\n",
    "# NOTE: pickle.load can run arbitrary code; only load this file from a trusted source\n",
    "with open('../Data/CFM_GQ/CFM_Cleaned.p','rb') as DataFile:\n",
    "    # The context manager closes the file handle once the pickle has been read\n",
    "    CleanedData = pickle.load(DataFile)\n",
    "\n",
    "# Unpack the entries of the cleaned-data dictionary that are used below\n",
    "HH = CleanedData['HH']\n",
    "DS = CleanedData['Record']\n",
    "VL = CleanedData['ValueLabel']\n",
    "\n",
    "## Function to Query the Value Label\n",
    "def Fun_ValueLabelQuery(VarName, Code, VLDS=VL):\n",
    "    \"\"\"\n",
    "    Look up the value label(s) of one or several codes of a variable.\n",
    "\n",
    "    VarName: Name of Variables to be Queried\n",
    "    Code: Code of the Variable (a single number or an iterable of codes)\n",
    "    VLDS: Dataset of the Value Labels; VLDS.loc[VarName,:] must give a table\n",
    "          indexed by code with a 'Value' column\n",
    "\n",
    "    Returns Code unchanged when VarName has no entry in VLDS. Otherwise returns\n",
    "    the label for a single numeric code, or a list of labels (same order as the\n",
    "    input) for an iterable of codes. Codes without a label map to '' and a\n",
    "    warning is emitted.\n",
    "    \"\"\"\n",
    "    import numbers\n",
    "    import warnings\n",
    "\n",
    "    if VarName not in VLDS.index:\n",
    "        return Code\n",
    "\n",
    "    Temp = VLDS.loc[VarName,:]\n",
    "    # numbers.Real also covers numpy scalars (e.g. np.int64), which are not `int` instances\n",
    "    IsScalar = isinstance(Code, numbers.Real)\n",
    "    CodeList = [Code] if IsScalar else list(Code)\n",
    "\n",
    "    SearchIdx = set(CodeList) & set(Temp.index)\n",
    "    # Check code by code so that repeated codes do not trigger a spurious warning\n",
    "    if any(ii not in SearchIdx for ii in CodeList):\n",
    "        # Warning(...) alone only builds an exception object; warnings.warn actually emits it\n",
    "        warnings.warn('There are codes without labels.')\n",
    "\n",
    "    # Unlabelled codes always map to '' (previously the function fell through and\n",
    "    # returned None when not a single code had a label)\n",
    "    LabelList = [Temp.loc[ii,'Value'] if ii in SearchIdx else '' for ii in CodeList]\n",
    "    return LabelList[0] if IsScalar else LabelList\n",
    "        \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['WorkStatus_MaleHead', 'Weight', 'WorkStatus_FemalHead', 'Age_Head',\n",
       "       'Marry', 'Edu_FemalHead', 'Income_NW', 'Region', 'HhComposition',\n",
       "       'HhSize', 'OwnRent', 'Income_W', 'CitySize', 'Edu_MaleHead', 'Obs'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Add the Constant Variable for Frequency Aggregation\n",
    "# (WeightedTab multiplies this column by the weight, so a constant 1 turns the sums into weighted counts)\n",
    "HH['Obs'] = 1\n",
    "# Display the column names, which now include 'Obs'\n",
    "HH.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unit Function to Tabulate the Frequency with Statistical Weight\n",
    "def WeightedTab(DS,Var):\n",
    "    Var_Time = 'period'\n",
    "    Var_Tab = 'Obs'\n",
    "    Var_Weight = 'Weight'\n",
    "    TempDS = DS.loc[:,[Var, Var_Tab, Var_Weight]]\n",
    "    TempDS['VW'] = TempDS[Var_Tab]*TempDS[Var_Weight]\n",
    "    VW = TempDS.groupby([Var_Time, Var])['VW'].sum().unstack()\n",
    "    W = TempDS.groupby(Var_Time)[Var_Weight].sum()\n",
    "    V = VW.div(W,axis=0)\n",
    "    \n",
    "    return V\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary Statistics over Time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Container for the per-variable weighted frequency tables\n",
    "SumStat = {}\n",
    "\n",
    "def TempFun_SumStat(Var):\n",
    "    \"\"\"\n",
    "    Weighted frequency table of `Var` in HH, with the columns sorted by code\n",
    "    and then renamed through Fun_ValueLabelQuery.\n",
    "    \"\"\"\n",
    "    Shares = WeightedTab(HH,Var)\n",
    "    Shares = Shares.sort_index(axis=1)\n",
    "    # Map every column code to whatever the value-label query returns for it\n",
    "    LabelMap = {}\n",
    "    for ColCode in Shares.columns:\n",
    "        LabelMap[ColCode] = Fun_ValueLabelQuery(Var,ColCode)\n",
    "    return Shares.rename(columns=LabelMap)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary statistics for Region... Done.\n",
      "Summary statistics for Income_W... Done.\n",
      "Summary statistics for HhSize... Done.\n",
      "Summary statistics for OwnRent... Done.\n",
      "Summary statistics for Age_Head... Done.\n"
     ]
    }
   ],
   "source": [
    "# Build and store the weighted frequency table of each variable, reporting progress on one line\n",
    "for Var in ('Region', 'Income_W', 'HhSize', 'OwnRent', 'Age_Head'):\n",
    "    print('Summary statistics for ', Var, '... ', sep='', end='')\n",
    "    SumStat[Var] = TempFun_SumStat(Var)\n",
    "    print('Done.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary Statistics on Average"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "DateLimit = (datetime.datetime(2014,10,1),datetime.datetime(2018,12,31))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The average summary statistics for  Region  is \n",
      "Region\n",
      "BRITISH COLUMBIA        0.131642\n",
      "ALBERTA                 0.103675\n",
      "SASKATCHEWAN            0.030088\n",
      "MANITOBA                0.035448\n",
      "ONTARIO                 0.367015\n",
      "QUEBEC                  0.257099\n",
      "NEW BRUNSWICK           0.021252\n",
      "PRINCE EDWARD ISLAND    0.004955\n",
      "NOVA SCOTIA             0.034628\n",
      "NEWFOUNDLAND            0.014198\n",
      "dtype: float64\n",
      "\n",
      "\n",
      "The average summary statistics for  Income_W  is \n",
      "Income_W\n",
      "Under $15,000         0.073207\n",
      "$15,000 - 19,999      0.047307\n",
      "$20,000 - 24,999      0.053210\n",
      "$25,000 - 29,999      0.043837\n",
      "$30,000 - 34,999      0.051297\n",
      "$35,000 - 44,999      0.096589\n",
      "$45,000 - 54,999      0.087193\n",
      "$55,000 - 59,999      0.040198\n",
      "$60,000 - 69,999      0.075815\n",
      "$70,000 - 99,999      0.219876\n",
      "$100,000 - 149,999    0.151774\n",
      "$150,000 +            0.059696\n",
      "dtype: float64\n",
      "\n",
      "\n",
      "The average summary statistics for  HhSize  is \n",
      "HhSize\n",
      "1    0.272981\n",
      "2    0.337386\n",
      "3    0.157696\n",
      "4    0.159935\n",
      "5    0.047708\n",
      "6    0.015807\n",
      "7    0.005215\n",
      "8    0.005093\n",
      "dtype: float64\n",
      "\n",
      "\n",
      "The average summary statistics for  OwnRent  is \n",
      "OwnRent\n",
      "Not Stated         NaN\n",
      "Own           0.693086\n",
      "Rent          0.306914\n",
      "dtype: float64\n",
      "\n",
      "\n",
      "The average summary statistics for  Age_Head  is \n",
      "Age_Head\n",
      "18    0.001306\n",
      "19    0.001944\n",
      "20    0.002501\n",
      "21    0.002656\n",
      "22    0.003461\n",
      "        ...   \n",
      "95    0.000520\n",
      "96    0.000775\n",
      "97    0.000978\n",
      "98    0.000341\n",
      "99    0.000981\n",
      "Length: 82, dtype: float64\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Average each variable's period-by-period table over the DateLimit window\n",
    "AvgSumStatDict = {}\n",
    "for Var, Table in SumStat.items():\n",
    "    # Boolean mask of the periods inside the window (both ends inclusive)\n",
    "    InWindow = (Table.index>=DateLimit[0]) & (Table.index<=DateLimit[1])\n",
    "    AvgSumStatDict[Var] = Table.loc[InWindow,:].mean()\n",
    "    print('The average summary statistics for ', Var, ' is ')\n",
    "    print(AvgSumStatDict[Var])\n",
    "    print('\\n')\n",
    "\n",
    "# Stack the per-variable averages (keyed by variable name) and export them to Excel\n",
    "pd.concat(AvgSumStatDict).to_excel('TableGraph/CFM_Sample_SumStat.xlsx')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.10 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 3,
  "vscode": {
   "interpreter": {
    "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
